# First let's import the packages we will use in this project
# You can do this all now or as you need them
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
# Make every figure default to 12x8 (width, height in inches)
matplotlib.rcParams['figure.figsize'] = (12,8)
# None switches off pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None
# Load the movies dataset from the local CSV export.
# NOTE: this is an absolute, machine-specific path.
MOVIES_CSV = r'C:\Users\Heng Kimhak\Downloads\movies.csv'
df = pd.read_csv(MOVIES_CSV)
# Let's look at the first few rows of the data
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 |
# Report, column by column, what percentage of values is missing.
# Iterating a DataFrame yields its column labels.
for column in df:
    is_missing = df[column].isnull()
    share_missing = np.mean(is_missing)
    print('{} - {}%'.format(column, round(share_missing * 100)))
name - 0% rating - 1% genre - 0% year - 0% released - 0% score - 0% votes - 0% director - 0% writer - 0% star - 0% country - 0% budget - 28% gross - 2% company - 0% runtime - 0%
# Data types for our columns.
# Per the output below, text fields load as object, while votes, budget,
# gross and runtime load as float64 and year as int64.
df.dtypes
name object rating object genre object year int64 released object score float64 votes float64 director object writer object star object country object budget float64 gross float64 company object runtime float64 dtype: object
# Change data type of columns
# df['budget'] = df['budget'].astype('Int64')
# df['gross'] = df['gross'].astype('Int64')
# df.dtypes

# Create correct Year column.
# 'released' looks like "June 13, 1980 (United States)", so taking the first
# four characters returns the month name ("June"), not the year. Extract the
# first run of four digits instead; values without one become NaN.
df['yearcorrect'] = df['released'].astype(str).str.extract(r'(\d{4})', expand=False)
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | June |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | July |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | June |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | July |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | July |
# Split "Month Day, Year (Country)" into two helper columns.
# n=1 splits at the first comma only, so the result never has more than two
# columns; an unbounded split would produce a third column (and fail the
# two-column assignment) as soon as one value contained a second comma.
df[["1", "2"]] = df["released"].str.split(",", n=1, expand=True)
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | June | June 13 | 1980 (United States) |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | July | July 2 | 1980 (United States) |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | June | June 20 | 1980 (United States) |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | July | July 2 | 1980 (United States) |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | July | July 25 | 1980 (United States) |
# Reduce the text after the comma (e.g. " 1980 (United States)") to the year.
# A regex is used instead of a fixed [1:5] slice so the result does not rely
# on exactly one leading space; values with no four-digit run (for example a
# missing second part) become NaN instead of a meaningless fragment.
df['2'] = df['2'].astype(str).str.extract(r'(\d{4})', expand=False)
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | June | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | July | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | June | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | July | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | July | July 25 | 1980 |
# Overwrite 'yearcorrect' with the year held in helper column '2'
df['yearcorrect'] = df['2']
# Preview to confirm 'yearcorrect' now shows the year
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | 1980 | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | 1980 | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | 1980 | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | 1980 | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | 1980 | July 25 | 1980 |
# Removal of the helper columns is disabled, so '1' and '2' stay in df.
# NOTE(review): because they remain, they are later category-encoded and show
# up in the correlation output as near-duplicates of 'released'/'yearcorrect'.
# df.pop('1')
# df.pop('2')
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | 1980 | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | 1980 | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | 1980 | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | 1980 | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | 1980 | July 25 | 1980 |
# Show the films ordered from highest to lowest gross.
# This returns a sorted copy for display; df itself keeps its original order.
df.sort_values('gross', ascending=False)
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5445 | Avatar | PG-13 | Action | 2009 | December 18, 2009 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Sam Worthington | United States | 237000000.0 | 2.847246e+09 | Twentieth Century Fox | 162.0 | 2009 | December 18 | 2009 |
7445 | Avengers: Endgame | PG-13 | Action | 2019 | April 26, 2019 (United States) | 8.4 | 903000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 356000000.0 | 2.797501e+09 | Marvel Studios | 181.0 | 2019 | April 26 | 2019 |
3045 | Titanic | PG-13 | Drama | 1997 | December 19, 1997 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Leonardo DiCaprio | United States | 200000000.0 | 2.201647e+09 | Twentieth Century Fox | 194.0 | 1997 | December 19 | 1997 |
6663 | Star Wars: Episode VII - The Force Awakens | PG-13 | Action | 2015 | December 18, 2015 (United States) | 7.8 | 876000.0 | J.J. Abrams | Lawrence Kasdan | Daisy Ridley | United States | 245000000.0 | 2.069522e+09 | Lucasfilm | 138.0 | 2015 | December 18 | 2015 |
7244 | Avengers: Infinity War | PG-13 | Action | 2018 | April 27, 2018 (United States) | 8.4 | 897000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 321000000.0 | 2.048360e+09 | Marvel Studios | 149.0 | 2018 | April 27 | 2018 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7663 | More to Life | NaN | Drama | 2020 | October 23, 2020 (United States) | 3.1 | 18.0 | Joseph Ebanks | Joseph Ebanks | Shannon Bond | United States | 7000.0 | NaN | NaN | 90.0 | 2020 | October 23 | 2020 |
7664 | Dream Round | NaN | Comedy | 2020 | February 7, 2020 (United States) | 4.7 | 36.0 | Dusty Dukatz | Lisa Huston | Michael Saquella | United States | NaN | NaN | Cactus Blue Entertainment | 90.0 | 2020 | February 7 | 2020 |
7665 | Saving Mbango | NaN | Drama | 2020 | April 27, 2020 (Cameroon) | 5.7 | 29.0 | Nkanya Nkwai | Lynno Lovert | Onyama Laura | United States | 58750.0 | NaN | Embi Productions | NaN | 2020 | April 27 | 2020 |
7666 | It's Just Us | NaN | Drama | 2020 | October 1, 2020 (United States) | NaN | NaN | James Randall | James Randall | Christina Roz | United States | 15000.0 | NaN | NaN | 120.0 | 2020 | October 1 | 2020 |
7667 | Tee em el | NaN | Horror | 2020 | August 19, 2020 (United States) | 5.7 | 7.0 | Pereko Mosia | Pereko Mosia | Siyabonga Mabaso | South Africa | NaN | NaN | PK 65 Films | 102.0 | 2020 | August 19 | 2020 |
7668 rows × 18 columns
# Remove the cap on the number of rows pandas displays
pd.set_option('display.max_rows', None)
# Look at the distinct company names in descending order.
# This only inspects the column; no rows are dropped from df.
unique_companies = df['company'].drop_duplicates()
unique_companies.sort_values(ascending=False).head()
7129 thefyzz 5664 micro_scope 6412 iDeal Partners Film Fund 4007 i5 Films 6793 i am OTHER Name: company, dtype: object
# Same listing without removing repeated company names first
companies_desc = df['company'].sort_values(ascending=False)
companies_desc.head()
7129 thefyzz 5664 micro_scope 6412 iDeal Partners Film Fund 4007 i5 Films 6793 i am OTHER Name: company, dtype: object
# Preview df again before plotting
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | 1980 | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | 1980 | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | 1980 | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | 1980 | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | 1980 | July 25 | 1980 |
# Hypotheses to check:
# Budget high correlation
# Company high correlation

# Scatter plot of budget (x axis) against gross earnings (y axis).
# The axis labels must follow the data passed as x and y; they were swapped.
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Budget for Films')
plt.ylabel('Gross Earnings')
plt.show()
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | 1980 | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | 1980 | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | 1980 | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | 1980 | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | 1980 | July 25 | 1980 |
# Plot budget vs gross using seaborn: red points with a blue fitted line
point_style = {"color": "red"}
fit_line_style = {"color": "blue"}
sns.regplot(data=df, x='budget', y='gross', scatter_kws=point_style, line_kws=fit_line_style)
<AxesSubplot:xlabel='budget', ylabel='gross'>
# Let's start looking at correlation.
# Select the numeric columns explicitly instead of relying on corr() to drop
# text columns silently; pandas 2.x no longer does that and raises on them.
# method can be 'pearson', 'kendall' or 'spearman'.
df.select_dtypes(include='number').corr(method='pearson')
year | score | votes | budget | gross | runtime | |
---|---|---|---|---|---|---|
year | 1.000000 | 0.097995 | 0.222945 | 0.329321 | 0.257486 | 0.120811 |
score | 0.097995 | 1.000000 | 0.409182 | 0.076254 | 0.186258 | 0.399451 |
votes | 0.222945 | 0.409182 | 1.000000 | 0.442429 | 0.630757 | 0.309212 |
budget | 0.329321 | 0.076254 | 0.442429 | 1.000000 | 0.740395 | 0.320447 |
gross | 0.257486 | 0.186258 | 0.630757 | 0.740395 | 1.000000 | 0.245216 |
runtime | 0.120811 | 0.399451 | 0.309212 | 0.320447 | 0.245216 | 1.000000 |
# High correlation between budget and gross

# Correlate the numeric columns only. They are selected explicitly so the
# call does not depend on corr() silently skipping text columns.
correlation_matrix = df.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Looks at Company
df.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 | 1980 | June 13 | 1980 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 | 1980 | July 2 | 1980 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 | 1980 | June 20 | 1980 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 | 1980 | July 2 | 1980 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 | 1980 | July 25 | 1980 |
# Build a numeric version of the data for correlation across all columns.
# Work on a copy: plain assignment only creates a second name for the same
# DataFrame, so the loop would overwrite the text columns of df itself.
df_numerized = df.copy()

# Replace every text (object) column with its integer category codes.
# Codes follow the sorted order of the distinct values and missing values are
# encoded as -1, so the numbers are labels rather than measured quantities.
for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6587 | 6 | 6 | 1980 | 1705 | 8.4 | 927000.0 | 2589 | 4014 | 1047 | 54 | 19000000.0 | 46998772.0 | 2319 | 146.0 | 0 | 212 | 0 |
1 | 5573 | 6 | 1 | 1980 | 1492 | 5.8 | 65000.0 | 2269 | 1632 | 327 | 55 | 4500000.0 | 58853106.0 | 731 | 104.0 | 0 | 188 | 0 |
2 | 5142 | 4 | 0 | 1980 | 1771 | 8.7 | 1200000.0 | 1111 | 2567 | 1745 | 55 | 18000000.0 | 538375067.0 | 1540 | 124.0 | 0 | 223 | 0 |
3 | 286 | 4 | 4 | 1980 | 1492 | 7.7 | 221000.0 | 1301 | 2000 | 2246 | 55 | 3500000.0 | 83453539.0 | 1812 | 88.0 | 0 | 188 | 0 |
4 | 1027 | 6 | 4 | 1980 | 1543 | 7.3 | 108000.0 | 1054 | 521 | 410 | 55 | 6000000.0 | 39846344.0 | 1777 | 98.0 | 0 | 194 | 0 |
# Heatmap of Pearson correlations across every column of df_numerized
correlation_matrix = df_numerized.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Also display the same correlations as a table
df_numerized.corr()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | 1 | 2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
name | 1.000000 | -0.008069 | 0.016355 | 0.011453 | -0.011311 | 0.017097 | 0.013088 | 0.009079 | 0.009081 | 0.006472 | -0.010737 | 0.023970 | 0.005533 | 0.009211 | 0.010392 | 0.012875 | -0.011412 | 0.012875 |
rating | -0.008069 | 1.000000 | 0.072423 | 0.008779 | 0.016613 | -0.001314 | 0.033225 | 0.019483 | -0.005921 | 0.013405 | 0.081244 | -0.176002 | -0.107339 | -0.032943 | 0.062145 | 0.009359 | 0.016390 | 0.009359 |
genre | 0.016355 | 0.072423 | 1.000000 | -0.081261 | 0.029822 | 0.027965 | -0.145307 | -0.015258 | 0.006567 | -0.005477 | -0.037615 | -0.356564 | -0.235650 | -0.071067 | -0.052711 | -0.063069 | 0.030043 | -0.063069 |
year | 0.011453 | 0.008779 | -0.081261 | 1.000000 | -0.000695 | 0.097995 | 0.222945 | -0.020795 | -0.008656 | -0.027242 | -0.070938 | 0.329321 | 0.257486 | -0.010431 | 0.120811 | 0.965667 | -0.003741 | 0.965667 |
released | -0.011311 | 0.016613 | 0.029822 | -0.000695 | 1.000000 | 0.042788 | 0.016097 | -0.001478 | -0.002404 | 0.015777 | -0.020427 | 0.014683 | 0.001659 | -0.010474 | 0.000868 | -0.010732 | 0.999759 | -0.010732 |
score | 0.017097 | -0.001314 | 0.027965 | 0.097995 | 0.042788 | 1.000000 | 0.409182 | 0.009559 | 0.019416 | -0.001609 | -0.133348 | 0.076254 | 0.186258 | 0.001030 | 0.399451 | 0.107146 | 0.041366 | 0.107146 |
votes | 0.013088 | 0.033225 | -0.145307 | 0.222945 | 0.016097 | 0.409182 | 1.000000 | 0.000260 | 0.000892 | -0.019282 | 0.073625 | 0.442429 | 0.630757 | 0.133204 | 0.309212 | 0.205894 | 0.015156 | 0.205894 |
director | 0.009079 | 0.019483 | -0.015258 | -0.020795 | -0.001478 | 0.009559 | 0.000260 | 1.000000 | 0.299067 | 0.039234 | 0.017490 | -0.012272 | -0.014441 | 0.004404 | 0.017624 | -0.022644 | -0.001446 | -0.022644 |
writer | 0.009081 | -0.005921 | 0.006567 | -0.008656 | -0.002404 | 0.019416 | 0.000892 | 0.299067 | 1.000000 | 0.027245 | 0.015343 | -0.039451 | -0.023519 | 0.005646 | -0.003511 | -0.010134 | -0.002719 | -0.010134 |
star | 0.006472 | 0.013405 | -0.005477 | -0.027242 | 0.015777 | -0.001609 | -0.019282 | 0.039234 | 0.027245 | 1.000000 | -0.012998 | -0.019589 | -0.002717 | 0.012442 | 0.010174 | -0.031268 | 0.016043 | -0.031268 |
country | -0.010737 | 0.081244 | -0.037615 | -0.070938 | -0.020427 | -0.133348 | 0.073625 | 0.017490 | 0.015343 | -0.012998 | 1.000000 | 0.054063 | 0.092129 | 0.095548 | -0.078412 | -0.091171 | -0.020124 | -0.091171 |
budget | 0.023970 | -0.176002 | -0.356564 | 0.329321 | 0.014683 | 0.076254 | 0.442429 | -0.012272 | -0.039451 | -0.019589 | 0.054063 | 1.000000 | 0.740395 | 0.173214 | 0.320447 | 0.314986 | 0.013792 | 0.314986 |
gross | 0.005533 | -0.107339 | -0.235650 | 0.257486 | 0.001659 | 0.186258 | 0.630757 | -0.014441 | -0.023519 | -0.002717 | 0.092129 | 0.740395 | 1.000000 | 0.154840 | 0.245216 | 0.240118 | 0.000973 | 0.240118 |
company | 0.009211 | -0.032943 | -0.071067 | -0.010431 | -0.010474 | 0.001030 | 0.133204 | 0.004404 | 0.005646 | 0.012442 | 0.095548 | 0.173214 | 0.154840 | 1.000000 | 0.034402 | -0.026825 | -0.010071 | -0.026825 |
runtime | 0.010392 | 0.062145 | -0.052711 | 0.120811 | 0.000868 | 0.399451 | 0.309212 | 0.017624 | -0.003511 | 0.010174 | -0.078412 | 0.320447 | 0.245216 | 0.034402 | 1.000000 | 0.115024 | -0.001234 | 0.115024 |
yearcorrect | 0.012875 | 0.009359 | -0.063069 | 0.965667 | -0.010732 | 0.107146 | 0.205894 | -0.022644 | -0.010134 | -0.031268 | -0.091171 | 0.314986 | 0.240118 | -0.026825 | 0.115024 | 1.000000 | -0.013700 | 1.000000 |
1 | -0.011412 | 0.016390 | 0.030043 | -0.003741 | 0.999759 | 0.041366 | 0.015156 | -0.001446 | -0.002719 | 0.016043 | -0.020124 | 0.013792 | 0.000973 | -0.010071 | -0.001234 | -0.013700 | 1.000000 | -0.013700 |
2 | 0.012875 | 0.009359 | -0.063069 | 0.965667 | -0.010732 | 0.107146 | 0.205894 | -0.022644 | -0.010134 | -0.031268 | -0.091171 | 0.314986 | 0.240118 | -0.026825 | 0.115024 | 1.000000 | -0.013700 | 1.000000 |
# Flatten the correlation matrix into a Series keyed by (column, column) pairs
correlation_mat = df_numerized.corr(method='pearson')
corr_pairs = correlation_mat.unstack()
corr_pairs.iloc[:5]
name name 1.000000 rating -0.008069 genre 0.016355 year 0.011453 released -0.011311 dtype: float64
# Sort the pairs ascending, so the most negative correlations come first
sorted_pairs = corr_pairs.sort_values()
sorted_pairs.head()
budget genre -0.356564 genre budget -0.356564 gross -0.235650 gross genre -0.235650 budget rating -0.176002 dtype: float64
# Keep only the pairs whose correlation is above 0.5.
# Each pair appears in both orders, and a column paired with itself is not
# filtered out.
strong_mask = sorted_pairs > 0.5
high_corr = sorted_pairs[strong_mask]
high_corr.head()
gross votes 0.630757 votes gross 0.630757 budget gross 0.740395 gross budget 0.740395 yearcorrect year 0.965667 dtype: float64
# Votes and budget have the highest correlation to gross earnings
# Company has low correlations